Kapitel 6.6: Zeiten¶

Das Notebook ergänzt Kapitel 6.6 'Zeiten'.

Import¶

In [1]:
import pandas as pd
import numpy as np

from resources_statistics import *
from resources_geschichtslyrik import *

import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

from tqdm.notebook import tqdm
In [2]:
# Load the metadata table from the shared resources folder
# (the path is relative to the notebook's working directory).
meta = pd.read_json(r"../resources/meta.json")

Merkmale hinzufügen¶

In [3]:
# Midpoint between 'beginn' and 'ende'; NaN propagates when either bound is missing,
# so no explicit notna() check is needed.
meta['zeit_mitte'] = (meta['beginn'] + meta['ende']) / 2
# Floor the midpoint to the start of its decade / century. Floor division rounds
# negative years downwards as well and leaves NaN untouched.
meta['dekade_mitte'] = (meta['zeit_mitte'] // 10) * 10
meta['jahrhundert_mitte'] = (meta['zeit_mitte'] // 100) * 100

Korpora¶

Korpora erstellen¶

In [4]:
# Anthology corpus: rows of corpus 'anth' with 1850 <= year <= 1918 and
# geschichtslyrik == 1, reduced to one row per 'author_title'.
anth_condition = "corpus == 'anth' and 1850 <= year <= 1918 and geschichtslyrik == 1"
meta_anth = meta.query(anth_condition).drop_duplicates(subset='author_title')

# Variant produced by the project helper binarize_meta; it feeds the correlation cells.
meta_anth_bin = binarize_meta(meta_anth)
In [5]:
# Authors that make up the 'Kanonisierte Moderne' comparison corpus.
modcanon_authors = [
    'Hofmannsthal, Hugo von',
    'Rilke, Rainer Maria',
    'George, Stefan',
    'Heym, Georg',
]

# Same year / geschichtslyrik filter as the anthology corpus, but selected by author.
meta_modcanon = meta.query(
    "author in @modcanon_authors and 1850 <= year <= 1918 and geschichtslyrik == 1"
).drop_duplicates(subset='author_title')
In [6]:
# Authors that make up the 'Münchhausen-Kreis' comparison corpus.
muench_authors = [
    'Münchhausen, Börries von',
    'Miegel, Agnes',
    'Strauß und Torney, Lulu von',
]

# Same year / geschichtslyrik filter as the anthology corpus, but selected by author.
meta_muench = meta.query(
    "author in @muench_authors and 1850 <= year <= 1918 and geschichtslyrik == 1"
).drop_duplicates(subset='author_title')
In [7]:
# The three sub-corpora and their display names, kept in matching order.
sub_metas = [meta_anth, meta_modcanon, meta_muench]
sub_names = ['Anthologien', 'Kanonisierte Moderne', 'Münchhausen-Kreis']

# Summary table with one row per sub-corpus; it is filled by the loop in the next cell.
sub_df = pd.DataFrame()

Merkmale berechnen¶

In [8]:
# Share columns expressed as pandas query strings. They are split into two groups
# because 'zeitebenen_per_text' (a mean, not a share) sits between them in sub_df.
zeitebenen_share_queries = {
    '1_zeitebene': "zeitebenen == 1",
    '2_zeitebene': "zeitebenen == 2",
    '3_zeitebene': "zeitebenen == 3",
    '4_zeitebene': "zeitebenen >= 4",
}
other_share_queries = {
    'antike': "zeit_mitte <= 499",
    'mittelalter': "500 <= zeit_mitte <= 1499",
    'neuzeit': "zeit_mitte >= 1500",
    'fixierbarkeit': "fixierbarkeit == 1",
    'gegenwartsbezug': "gegenwartsbezug == 1",
    'gegenwartsdominanz': "vergangenheitsdominant == 0",
}

for this_name, this_meta in zip(sub_names, sub_metas):
    # Every share below uses the full sub-corpus size as its denominator.
    n_texts = this_meta.shape[0]

    sub_df.loc[this_name, 'Jahr'] = round(this_meta['year'].mean(), 0)
    sub_df.loc[this_name, 'Texte'] = n_texts

    for share_col, share_query in zeitebenen_share_queries.items():
        sub_df.loc[this_name, share_col] = this_meta.query(share_query).shape[0] / n_texts
    sub_df.loc[this_name, 'zeitebenen_per_text'] = this_meta['zeitebenen'].mean()

    for share_col, share_query in other_share_queries.items():
        sub_df.loc[this_name, share_col] = this_meta.query(share_query).shape[0] / n_texts
In [9]:
# Show the sub-corpus summary rounded to two decimals.
sub_df.round(2)
Out[9]:
Jahr Texte 1_zeitebene 2_zeitebene 3_zeitebene 4_zeitebene zeitebenen_per_text antike mittelalter neuzeit fixierbarkeit gegenwartsbezug gegenwartsdominanz
Anthologien 1875.0 1850.0 0.28 0.46 0.20 0.05 2.03 0.13 0.32 0.54 0.64 0.29 0.14
Kanonisierte Moderne 1903.0 113.0 0.47 0.44 0.07 0.02 1.64 0.26 0.21 0.53 0.45 0.21 0.12
Münchhausen-Kreis 1905.0 140.0 0.26 0.49 0.16 0.09 2.10 0.05 0.31 0.63 0.48 0.19 0.08

Zeitverlauf¶

In [10]:
# Empty time-series frame with one row per year from 1850 to 1918, index named 'year'.
ts = pd.DataFrame(index=pd.Series(range(1850, 1919), name='year'))
In [11]:
# Texts per year in the anthology corpus; years without any text get a count of 0.
texts_per_year = meta_anth.groupby('year').size()
ts['text_count'] = texts_per_year.reindex(ts.index).fillna(0)
# Smoothed yearly totals from the project helper smooth(); used as the denominator of all shares.
ts['text_sum'] = smooth(ts['text_count'], mode='sum')
In [12]:
def _add_smoothed_ratio(prefix, yearly_values, ratio_col):
    """Write '<prefix>_count', '<prefix>_sum' and `ratio_col` into the global `ts`.

    prefix        -- column-name prefix for the raw and smoothed columns
    yearly_values -- one value per year of `ts.index`, in index order
    ratio_col     -- name of the column holding smoothed sum / smoothed text total
    """
    ts[f'{prefix}_count'] = yearly_values
    ts[f'{prefix}_sum'] = smooth(ts[f'{prefix}_count'], mode='sum')
    ts[ratio_col] = ts[f'{prefix}_sum'] / ts['text_sum']


def _yearly_text_counts(condition):
    """Count, for every year in `ts.index`, the anthology texts matching `condition`.

    The query runs once per feature instead of once per year.
    """
    matching_years = meta_anth.query(condition)['year']
    return [int((matching_years == year).sum()) for year in ts.index]


# Shares of texts with exactly 1, 2, 3 and with 4 or more time levels.
for prefix, condition in [
    ('1_zeitebene', "zeitebenen == 1"),
    ('2_zeitebene', "zeitebenen == 2"),
    ('3_zeitebene', "zeitebenen == 3"),
    ('4_zeitebene', "zeitebenen >= 4"),
]:
    _add_smoothed_ratio(prefix, _yearly_text_counts(condition), f'{prefix}_share_smoothed')

# Time levels per text: sums the 'zeitebenen' values per year instead of counting rows,
# and uses its own name for the ratio column.
_add_smoothed_ratio(
    'zeitebenen',
    [meta_anth.loc[meta_anth['year'] == year, 'zeitebenen'].sum() for year in ts.index],
    'zeitebenen_per_text_smoothed',
)

# Shares by treated epoch, fixability and relation to the present.
# Note that the 'fixierbar' prefix differs from the 'fixierbarkeit' column it is based on.
for prefix, condition in [
    ('antike', "zeit_mitte <= 499"),
    ('mittelalter', "500 <= zeit_mitte <= 1499"),
    ('neuzeit', "zeit_mitte >= 1500"),
    ('fixierbar', "fixierbarkeit == 1"),
    ('gegenwartsbezug', "gegenwartsbezug == 1"),
    ('gegenwartsdominanz', "vergangenheitsdominant == 0"),
]:
    _add_smoothed_ratio(prefix, _yearly_text_counts(condition), f'{prefix}_share_smoothed')

Anzahl¶

In [13]:
# Smoothed share columns mapped to the labels shown in the plot.
zeitebenen_labels = {
    '1_zeitebene_share_smoothed': '1 Zeitebene',
    '2_zeitebene_share_smoothed': '2 Zeitebenen',
    '3_zeitebene_share_smoothed': '3 Zeitebenen',
    '4_zeitebene_share_smoothed': '4 oder mehr Zeitebenen',
}
meta_plot = ts[list(zeitebenen_labels)].rename(columns=zeitebenen_labels)
# save_ts_data(meta_plot, prefix='06_06_Anzahl_Zeitebenen_')

# Time-series plot with the sub-corpus values from sub_df added for comparison.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['1_zeitebene', '2_zeitebene', '3_zeitebene', '4_zeitebene'],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.6 Anzahl Zeitebenen im Zeitverlauf.pdf")
fig.show()
In [14]:
# Smoothed time levels per text of the anthology corpus, labelled for the plot.
meta_plot = ts[['zeitebenen_per_text_smoothed']].rename(
    columns={'zeitebenen_per_text_smoothed': 'Anthologiekorpus'}
)

# This figure is only shown; it is neither restyled for publication nor written to disk.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Zeitebenen pro Text (Mittelwert)',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['zeitebenen_per_text'],
)
fig.show()
In [15]:
# Feature whose relations to all other features are examined in the following cells.
main_feature = 'zeitebenen'
In [16]:
# The 20 strongest positive correlations with the main feature.
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values(ascending=False)
    .iloc[:20]
)
Out[16]:
zeitebenen                              1.000000
zeitmarker_vorhanden                    0.379343
gegenwartsbezug                         0.345756
liebe_positiv                           0.310905
marker_count                            0.294403
geschichtsauffassung_positiv            0.268621
ueberlieferung                          0.240563
sprechinstanz_nicht_in_vergangenheit    0.227481
words                                   0.218655
nationalismus                           0.208573
gegenwartsdominant                      0.205095
sprechinstanz_markiert                  0.172428
praesens_praeteritum_vorhanden          0.162247
nation_volk_d                           0.139624
sprechakte_count                        0.134375
stoffgebiet_count                       0.132739
zustand                                 0.127242
praeteritum_vorhanden                   0.117879
anachronismus                           0.117067
mittelraum_count                        0.104600
Name: zeitebenen, dtype: float64
In [17]:
# The 20 lowest (most negative) correlations with the main feature.
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values()
    .iloc[:20]
)
Out[17]:
wissen_ergaenzend                -0.155536
in_hohem_mass_konkret            -0.147513
konkretheit                      -0.144985
unbekanntes_individuum_negativ   -0.098230
liebe_negativ                    -0.097524
tod_negativ                      -0.069980
kollektiv_count                  -0.053933
tod_positiv                      -0.053372
entity_negativ                   -0.053127
religion                         -0.052141
ballade                          -0.046579
sprechinstanz_in_vergangenheit   -0.044282
unbekanntes_individuum_positiv   -0.042127
mittelalter                      -0.039734
ueberlieferung_negativ           -0.036360
kollektiv_negativ                -0.032346
krieg                            -0.029887
bekanntes_individuum_negativ     -0.026326
reim                             -0.025997
entity_ambivalent                -0.018668
Name: zeitebenen, dtype: float64
In [18]:
# Minimum absolute correlation handed to get_features and reused by the filter cell below.
threshold = 0.2

# Compute the correlation matrix once: it is identical for both calls and is the
# expensive part of this cell.
corr_with_main = meta_anth_bin.corr(numeric_only=True)[main_feature]

# Features selected by the project helper, separately for its 'bin' and 'cont' modes.
bin_comp_features = get_features(corr_with_main, threshold=threshold, mode='bin')
cont_comp_features = get_features(corr_with_main, threshold=threshold, mode='cont')
In [19]:
# Relations between the main feature and every selected 'bin' comparison feature.
results = relations_contbin(meta=meta_anth_bin, main_feature=main_feature, comp_features=bin_comp_features)
In [20]:
# Features to exclude by hand; the list is empty here.
directly_related = []

# Keep rows that are not excluded, have a Mann-Whitney p below 0.05 and whose
# point-biserial correlation reaches the threshold in either direction.
not_excluded = ~results.index.isin(directly_related)
is_significant = results['mannwhitneyu_p'] < 0.05
is_strong = (results['pointbiserialr_corr'] >= threshold) | (results['pointbiserialr_corr'] <= -threshold)

results_filtered = (
    results
    .loc[not_excluded & is_significant & is_strong]
    .sort_values(by='pointbiserialr_corr', ascending=False)
)
results_filtered.round(2)
Out[20]:
wenn zeitebenen = 0: Anteil Texte mit Feature = ... wenn zeitebenen = 1: Anteil Texte mit Feature = ... wenn zeitebenen = 2: Anteil Texte mit Feature = ... wenn zeitebenen = 3: Anteil Texte mit Feature = ... wenn zeitebenen > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
zeitmarker_vorhanden 0 [0/0] 0.14638783269961977 [77/526] 0.45784543325526933 [391/854] 0.6385224274406333 [242/379] 0.7582417582417582 [69/91] 0.38 0.00 0.00 -0.82 240471.0 0.00
gegenwartsbezug 0 [0/0] 0.03802281368821293 [20/526] 0.3442622950819672 [294/854] 0.45910290237467016 [174/379] 0.5274725274725275 [48/91] 0.35 0.00 0.00 -0.82 202695.5 0.00
liebe_positiv 0 [0/0] 0.3333333333333333 [4/12] 0.42857142857142855 [12/28] 0.65 [13/20] 1.0 [3/3] 0.31 0.01 0.01 -0.66 333.5 0.02
geschichtsauffassung_positiv 0 [0/0] 0.0 [0/8] 0.3333333333333333 [13/39] 0.375 [9/24] 0.6666666666666666 [4/6] 0.27 0.02 0.02 -0.59 479.5 0.03
ueberlieferung 0 [0/0] 0.06653992395437262 [35/526] 0.2728337236533958 [233/854] 0.3403693931398417 [129/379] 0.3956043956043956 [36/91] 0.24 0.00 0.00 -0.59 208645.0 0.00
sprechinstanz_nicht_in_vergangenheit 0 [0/0] 0.09885931558935361 [52/526] 0.32786885245901637 [280/854] 0.37730870712401055 [143/379] 0.4175824175824176 [38/91] 0.23 0.00 0.00 -0.53 243926.0 0.00
nationalismus 0 [0/0] 0.053231939163498096 [28/526] 0.13114754098360656 [112/854] 0.21899736147757257 [83/379] 0.32967032967032966 [30/91] 0.21 0.00 0.00 -0.60 137540.5 0.00
gegenwartsdominant 0 [0/0] 0.02091254752851711 [11/526] 0.17096018735362997 [146/854] 0.23746701846965698 [90/379] 0.21978021978021978 [20/91] 0.21 0.00 0.00 -0.63 138975.5 0.00
In [21]:
# Everything that did not pass the filter above, sorted by correlation for inspection.
results_other = results.loc[~results.index.isin(results_filtered.index)]
results_other.sort_values(by='pointbiserialr_corr', ascending=False).round(2)
Out[21]:
wenn zeitebenen = 0: Anteil Texte mit Feature = ... wenn zeitebenen = 1: Anteil Texte mit Feature = ... wenn zeitebenen = 2: Anteil Texte mit Feature = ... wenn zeitebenen = 3: Anteil Texte mit Feature = ... wenn zeitebenen > 3: Anteil Texte mit Feature = ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p
stoffgebiet_positiv 0 [0/0] 0.42015209125475284 [221/526] 0.4519906323185012 [386/854] 0.43799472295514513 [166/379] 0.46153846153846156 [42/91] 0.09 0.00 0.04 -0.10 328983.0 0.04
entity_positiv 0 [0/0] 0.43155893536121676 [227/526] 0.4613583138173302 [394/854] 0.41160949868073876 [156/379] 0.5384615384615384 [49/91] 0.07 0.00 0.09 -0.09 214392.5 0.11
bekanntes_individuum_positiv 0 [0/0] 0.6544502617801047 [250/382] 0.6971153846153846 [435/624] 0.6920415224913494 [200/289] 0.8055555555555556 [58/72] 0.06 0.03 0.03 -0.13 187505.0 0.05
kollektiv_positiv 0 [0/0] 0.3925619834710744 [95/242] 0.44110275689223055 [176/399] 0.47530864197530864 [77/162] 0.5 [13/26] 0.06 0.09 0.09 -0.12 78714.0 0.07
stoffgebiet_ambivalent 0 [0/0] 0.16920152091254753 [89/526] 0.1405152224824356 [120/854] 0.19261213720316622 [73/379] 0.17582417582417584 [16/91] 0.02 0.41 0.39 -0.05 222628.5 0.47
stoffgebiet_negativ 0 [0/0] 0.2224334600760456 [117/526] 0.22131147540983606 [189/854] 0.2532981530343008 [96/379] 0.18681318681318682 [17/91] 0.02 0.48 0.81 -0.01 282618.5 0.56
entity_neutral 0 [0/0] 0.2737642585551331 [144/526] 0.2962529274004684 [253/854] 0.26649076517150394 [101/379] 0.21978021978021978 [20/91] -0.01 0.69 0.30 0.06 277815.0 0.48
stoffgebiet_neutral 0 [0/0] 0.19771863117870722 [104/526] 0.1721311475409836 [147/854] 0.158311345646438 [60/379] 0.14285714285714285 [13/91] -0.02 0.50 0.14 0.09 248103.0 0.09
entity_ambivalent 0 [0/0] 0.11026615969581749 [58/526] 0.10889929742388758 [93/854] 0.09762532981530343 [37/379] 0.04395604395604396 [4/91] -0.02 0.42 0.11 0.13 164914.0 0.21
bekanntes_individuum_negativ 0 [0/0] 0.21727748691099477 [83/382] 0.16666666666666666 [104/624] 0.18339100346020762 [53/289] 0.19444444444444445 [14/72] -0.03 0.33 0.33 0.07 147334.0 0.26
kollektiv_negativ 0 [0/0] 0.3512396694214876 [85/242] 0.2857142857142857 [114/399] 0.3395061728395062 [55/162] 0.23076923076923078 [6/26] -0.03 0.35 0.35 0.07 76521.0 0.39
unbekanntes_individuum_positiv 0 [0/0] 0.4589041095890411 [67/146] 0.4076086956521739 [75/184] 0.345679012345679 [28/81] 0.5217391304347826 [12/23] -0.04 0.38 0.38 0.09 24247.0 0.28
entity_negativ 0 [0/0] 0.30038022813688214 [158/526] 0.24004683840749413 [205/854] 0.2691292875989446 [102/379] 0.1978021978021978 [18/91] -0.05 0.02 0.04 0.11 329306.0 0.04
unbekanntes_individuum_negativ 0 [0/0] 0.23972602739726026 [35/146] 0.14130434782608695 [26/184] 0.12345679012345678 [10/81] 0.17391304347826086 [4/23] -0.10 0.04 0.04 0.26 15601.5 0.02
In [22]:
# Compare the filtered relations between the two halves of the period (1850–1884 vs. 1885–1918).
result_categories = ['pointbiserialr_corr', 'mannwhitneyu_p']

meta_until_1884 = meta_anth_bin.query("1850 <= year <= 1884")
meta_from_1885 = meta_anth_bin.query("1885 <= year <= 1918")

results_a = relations_contbin(meta=meta_until_1884, main_feature=main_feature, comp_features=results_filtered.index)
results_b = relations_contbin(meta=meta_from_1885, main_feature=main_feature, comp_features=results_filtered.index)

# Put both periods side by side; the suffixes name the first year of each period.
results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
# Positive values mean the correlation is higher in the later period.
results_merged['diff_of_corrs'] = results_merged['pointbiserialr_corr_1885'].sub(
    results_merged['pointbiserialr_corr_1850']
)

results_merged.sort_values(by='diff_of_corrs').round(3)
Out[22]:
pointbiserialr_corr_1850 mannwhitneyu_p_1850 pointbiserialr_corr_1885 mannwhitneyu_p_1885 diff_of_corrs
liebe_positiv 0.400 0.007 0.052 0.951 -0.348
gegenwartsbezug 0.368 0.000 0.291 0.000 -0.077
sprechinstanz_nicht_in_vergangenheit 0.240 0.000 0.196 0.000 -0.043
gegenwartsdominant 0.217 0.000 0.179 0.000 -0.037
nationalismus 0.208 0.000 0.210 0.000 0.002
zeitmarker_vorhanden 0.375 0.000 0.391 0.000 0.015
ueberlieferung 0.221 0.000 0.289 0.000 0.068
geschichtsauffassung_positiv 0.227 0.083 0.343 0.234 0.116
In [23]:
# Relations between the main feature and the rating features, weakest correlation first.
results = relations_contbin_ratings(meta_anth_bin, main_feature)
results.sort_values('pointbiserialr_corr', ascending=True)
  0%|          | 0/14 [00:00<?, ?it/s]
Out[23]:
wenn zeitebenen = 1: Anteil mit Feature = ... wenn zeitebenen = 2: Anteil mit Feature = ... wenn zeitebenen = 3: Anteil mit Feature = ... wenn zeitebenen = 4: Anteil mit Feature = ... pointbiserialr_corr pointbiserialr_p mannwhitneyu_stat mannwhitneyu_p
unbekanntes_individuum_negativ 0.1912 [39/204] 0.1019 [27/265] 0.0887 [11/124] 0.1176 [4/34] -0.106249 0.007558 18064.0 0.003468
unbekanntes_individuum_positiv 0.3775 [77/204] 0.3434 [91/265] 0.2984 [37/124] 0.3529 [12/34] -0.063375 0.111744 41898.0 0.139787
entity_negativ 0.1951 [221/1133] 0.1506 [264/1753] 0.1545 [129/835] 0.1629 [29/178] -0.040320 0.011624 988881.5 0.008402
stoffgebiet_neutral 0.203 [135/665] 0.2015 [239/1186] 0.1595 [89/558] 0.1186 [14/118] -0.039959 0.044041 466177.5 0.027453
kollektiv_negativ 0.2767 [88/318] 0.2349 [117/498] 0.2639 [57/216] 0.2 [6/30] -0.028162 0.358762 103369.0 0.414886
bekanntes_individuum_negativ 0.1554 [90/579] 0.1282 [114/889] 0.1264 [56/443] 0.1442 [15/104] -0.025073 0.259428 230722.0 0.209363
entity_ambivalent 0.0662 [75/1133] 0.061 [107/1753] 0.0647 [54/835] 0.0449 [8/178] -0.015126 0.343998 436605.0 0.477226
entity_neutral 0.2992 [339/1133] 0.3006 [527/1753] 0.2862 [239/835] 0.2528 [45/178] -0.011455 0.473606 1570087.5 0.389664
stoffgebiet_negativ 0.2211 [147/665] 0.2057 [244/1186] 0.2079 [116/558] 0.2458 [29/118] -0.010792 0.586695 530833.5 0.657359
stoffgebiet_ambivalent 0.1489 [99/665] 0.1164 [138/1186] 0.1523 [85/558] 0.1186 [14/118] -0.002756 0.889572 370070.0 0.859813
stoffgebiet_positiv 0.4271 [284/665] 0.4764 [565/1186] 0.4803 [268/558] 0.5169 [61/118] 0.042113 0.033810 840859.0 0.026669
entity_positiv 0.4395 [498/1133] 0.4877 [855/1753] 0.4946 [413/835] 0.5393 [96/178] 0.047704 0.002827 2015173.0 0.002032
bekanntes_individuum_positiv 0.5233 [303/579] 0.5838 [519/889] 0.5801 [257/443] 0.625 [65/104] 0.058191 0.008813 533321.5 0.012740
kollektiv_positiv 0.327 [104/318] 0.3795 [189/498] 0.412 [89/216] 0.4667 [14/30] 0.062169 0.042614 142185.0 0.027470
In [24]:
# Relations between the main feature and every selected 'cont' comparison feature.
results = relations_contcont(meta=meta_anth_bin, main_feature=main_feature, comp_features=cont_comp_features)
In [25]:
# Show the results rounded to five decimals.
results.round(5)
Out[25]:
wenn zeitebenen = 0: Mittelwert Feature = ... wenn zeitebenen = 1: Mittelwert Feature = ... wenn zeitebenen = 2: Mittelwert Feature = ... wenn zeitebenen = 3: Mittelwert Feature = ... wenn zeitebenen > 3: Mittelwert Feature = ... pearsonr_corr pearsonr_p
marker_count NaN 1.74335 2.16511 2.40897 2.72527 0.29440 0.0
words NaN 268.76220 298.44823 369.11549 469.10714 0.21865 0.0
In [26]:
# Box plots of every continuous comparison feature, grouped by the number of time levels.
meta_plot = meta_anth_bin.copy()
# Merge four or more time levels into one group, then turn the groups into string labels.
meta_plot['zeitebenen'] = meta_plot['zeitebenen'].clip(upper=4)
meta_plot['zeitebenen'] = meta_plot['zeitebenen'].replace({1:'1', 2:'2', 3:'3', 4:'>=4'})
meta_plot = meta_plot.sort_values(by='zeitebenen')
# Cap word counts at 1250, presumably so a few very long texts do not stretch the axis.
meta_plot['words'] = meta_plot['words'].clip(upper=1250)

# Axis font settings. `title.font` replaces the deprecated `titlefont` attribute,
# which Plotly 6 no longer accepts; the nested form works in older versions too.
axis_style = dict(tickfont=dict(size=16), title=dict(font=dict(size=16)))

for cont_comp_feature in cont_comp_features:
    fig = px.box(
        meta_plot,
        x = main_feature,
        y = cont_comp_feature,
        labels = {'zeitebenen' : 'Anzahl Zeitebenen',
                  'marker_count' : 'Anzahl Geschichtsmarker-Typen',
                  'words' : 'Anzahl Wörter',
                 },
    )
    # Show the mean marker for every feature except the capped word count.
    if cont_comp_feature != 'words':
        fig.update_traces(boxmean=True)
    fig.update_layout(
        width = 700, height = 300,
        xaxis=axis_style,
        yaxis=axis_style,
        legend=dict(font = dict(size=16), x=0.61, y = 0.88),
        bargap=0.1
    )
    fig = update_fig_for_publication(fig, make_grey=True)
    fig.write_image(f"plots/6.6 Zeitebenen – {cont_comp_feature}.pdf")
    fig.show()
In [27]:
# Compare the continuous relations between the two halves of the period (1850–1884 vs. 1885–1918).
result_categories = ['pearsonr_corr', 'pearsonr_p']

meta_until_1884 = meta_anth_bin.query("1850 <= year <= 1884")
meta_from_1885 = meta_anth_bin.query("1885 <= year <= 1918")

results_a = relations_contcont(meta=meta_until_1884, main_feature=main_feature, comp_features=cont_comp_features)
results_b = relations_contcont(meta=meta_from_1885, main_feature=main_feature, comp_features=cont_comp_features)

# Put both periods side by side; the suffixes name the first year of each period.
results_merged = results_a[result_categories].join(
    results_b[result_categories], lsuffix='_1850', rsuffix='_1885'
)
# Positive values mean the correlation is higher in the later period.
results_merged['diff_of_corrs'] = results_merged['pearsonr_corr_1885'].sub(
    results_merged['pearsonr_corr_1850']
)

results_merged.sort_values(by='diff_of_corrs').round(5)
Out[27]:
pearsonr_corr_1850 pearsonr_p_1850 pearsonr_corr_1885 pearsonr_p_1885 diff_of_corrs
marker_count 0.29923 0.0 0.28519 0.0 -0.01404
words 0.18990 0.0 0.31380 0.0 0.12390
In [28]:
# Relation between 'nationalismus' and 'gegenwartsbezug', shown transposed (one statistic per row).
results = relations_binbin(meta=meta_anth_bin, main_feature='nationalismus', comp_features=['gegenwartsbezug'])
results.transpose()
Out[28]:
gegenwartsbezug
wenn_nicht 0.222918
wenn_nicht_detail 356/1597
wenn_ja 0.711462
wenn_ja_detail 180/253
diff_low_bootstrap 0.431462
diff_low 0.4291
diff 0.488544
diff_high 0.547989
diff_high_bootstrap 0.552505
chi2 253.305857
chi2_p 0.0
fisher_p 0.0
phi 0.37003
min_real 73.0
min_expected 73.301622

Behandelte Zeiten¶

In [29]:
# Histogram of the century of the treated time (midpoint) in the anthology corpus.
meta_plot = meta_anth.copy()
# Everything earlier than -500 is collected in the -500 bin.
meta_plot['jahrhundert_mitte'] = meta_plot['jahrhundert_mitte'].clip(lower=-500)

fig = px.histogram(
    meta_plot,
    x = 'jahrhundert_mitte',
    labels = {'jahrhundert_mitte' : 'Jahrhundert'},
)
# `title.font` replaces the deprecated `titlefont` attribute, which Plotly 6 no longer
# accepts; the nested form works in older versions too.
fig.update_layout(
    width=900, height=500,
    xaxis=dict(tickfont=dict(size=16), title=dict(font=dict(size=16)), range=[-600, 2100]),
    yaxis=dict(title=dict(text='Anzahl Texte', font=dict(size=16)), tickfont=dict(size=16)),
    legend=dict(font = dict(size=16), traceorder = 'normal'),
    showlegend=False
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image("plots/6.6 Behandelte Jahrhunderte.pdf")
fig.show()
In [30]:
# Number of anthology texts whose treated-time midpoint is 1000 or earlier.
len(meta_anth.query("zeit_mitte <= 1000"))
Out[30]:
432
In [31]:
# The ten most frequent decades of the treated time (midpoint).
meta_anth['dekade_mitte'].value_counts().iloc[:10]
Out[31]:
dekade_mitte
1870.0    141
1810.0     72
1520.0     68
1860.0     58
1630.0     46
1800.0     45
1750.0     43
1850.0     42
1510.0     40
1880.0     40
Name: count, dtype: int64
In [32]:
# One labelled subset of the anthology corpus per publication decade (the last one ends in 1918).
# assign() returns a new frame, so meta_anth itself is never modified.
meta_plot_1850 = meta_anth.query("1850<=year<=1859").assign(corpus='1850er')
meta_plot_1860 = meta_anth.query("1860<=year<=1869").assign(corpus='1860er')
meta_plot_1870 = meta_anth.query("1870<=year<=1879").assign(corpus='1870er')
meta_plot_1880 = meta_anth.query("1880<=year<=1889").assign(corpus='1880er')
meta_plot_1890 = meta_anth.query("1890<=year<=1899").assign(corpus='1890er')
meta_plot_1900 = meta_anth.query("1900<=year<=1909").assign(corpus='1900er')
meta_plot_1910 = meta_anth.query("1910<=year<=1918").assign(corpus='1910er')
# NOTE(review): these two assignments overwrite the 'corpus' column of the shared
# comparison frames in place. Kept as-is in case later cells rely on it; confirm, and
# switch to .assign(corpus=...) inside the concat below if they do not.
meta_modcanon['corpus'] = 'Kanonisierte Moderne'
meta_muench['corpus'] = 'Münchhausen-Kreis'

meta_plot = pd.concat([
    meta_plot_1850, meta_plot_1860, meta_plot_1870, meta_plot_1880,
    meta_plot_1890, meta_plot_1900, meta_plot_1910, meta_modcanon, meta_muench
])
# Midpoints below 0 are drawn at 0.
meta_plot['zeit_mitte'] = meta_plot['zeit_mitte'].clip(lower=0)

fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'zeit_mitte',
    points = 'all',
    labels = {'corpus' : '', 'zeit_mitte' : 'Dominante Zeitebene'},
)
# `title.font` replaces the deprecated `titlefont` attribute, which Plotly 6 no longer
# accepts; the nested form works in older versions too.
axis_style = dict(tickfont=dict(size=16), title=dict(font=dict(size=16)))
fig.update_layout(
    width=900, height=500,
    xaxis=axis_style,
    yaxis=axis_style,
    legend=dict(font = dict(size=16), traceorder = 'normal'),
    showlegend=False
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image("plots/6.6 Behandelte Zeiten im Zeitverlauf.pdf")
fig.show()
In [33]:
# Smoothed epoch shares mapped to the labels shown in the plot.
epoch_labels = {
    'antike_share_smoothed': 'vor 500',
    'mittelalter_share_smoothed': '500 bis 1499',
    'neuzeit_share_smoothed': 'ab 1500',
}
meta_plot = ts[list(epoch_labels)].rename(columns=epoch_labels)
save_ts_data(meta_plot, prefix='06_06_Behandelte_Epochen_')

# Time-series plot with the sub-corpus values from sub_df added for comparison.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['antike', 'mittelalter', 'neuzeit'],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.6 Behandelte Epochen im Zeitverlauf.pdf")
fig.show()
In [34]:
# For every publication decade, print the two most frequent treated centuries.
for decade_start in meta_anth['decade'].unique():
    decade_texts = meta_anth[meta_anth['decade'] == decade_start]
    print(f"{int(decade_start)}er:")
    print(decade_texts['jahrhundert_mitte'].value_counts().head(2))
    print("\n")
1850er:
jahrhundert_mitte
1500.0    79
1800.0    61
Name: count, dtype: int64


1860er:
jahrhundert_mitte
1800.0    72
1700.0    44
Name: count, dtype: int64


1870er:
jahrhundert_mitte
1800.0    142
1500.0     39
Name: count, dtype: int64


1880er:
jahrhundert_mitte
1800.0    71
1200.0    21
Name: count, dtype: int64


1890er:
jahrhundert_mitte
1800.0    62
1600.0    12
Name: count, dtype: int64


1900er:
jahrhundert_mitte
1800.0    31
1500.0    22
Name: count, dtype: int64


1910er:
jahrhundert_mitte
1800.0    26
1900.0    22
Name: count, dtype: int64


In [35]:
# Number of anthology texts with vergangenheitsdominant == 0.
len(meta_anth.query("vergangenheitsdominant == 0"))
Out[35]:
267
In [36]:
# The five most frequent treated centuries among texts with vergangenheitsdominant != 0.
(
    meta_anth
    .query("vergangenheitsdominant != 0")['jahrhundert_mitte']
    .value_counts()
    .iloc[:5]
)
Out[36]:
jahrhundert_mitte
1800.0    230
1500.0    208
1700.0    165
1600.0    130
1200.0    116
Name: count, dtype: int64
In [37]:
# Smoothed share of texts with fixierbarkeit == 1, labelled for the plot.
meta_plot = ts[['fixierbar_share_smoothed']].rename(
    columns={'fixierbar_share_smoothed': 'Fixierbarkeit'}
)
# save_ts_data(meta_plot, prefix='06_06_Fixierbarkeit_')

# Time-series plot with the sub-corpus values from sub_df added for comparison.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['fixierbarkeit'],
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image("plots/6.6 Fixierbarkeit der dominanten Zeitebene im Zeitverlauf.pdf")
fig.show()
In [38]:
# Texts from 1870/71: is fixierbarkeit related to treating a war of that time?
meta_1870 = meta_anth.query("1870 <= year <= 1871").copy()
# 1 when 'stoffgebiet' contains 'Krieg' and the treated-time midpoint is 1870 or later, else 0.
meta_1870['krieg_gegenwart'] = [
    int('Krieg' in stoffgebiet and zeit_mitte >= 1870)
    for stoffgebiet, zeit_mitte in zip(meta_1870['stoffgebiet'], meta_1870['zeit_mitte'])
]
contingency_table = pd.crosstab(meta_1870['fixierbarkeit'], meta_1870['krieg_gegenwart'])

# Run the test once and reuse both values.
chi2_stat, chi2_p = chi2_contingency(contingency_table)[:2]

print(contingency_table)
print("\n")
print(f"chi2   : {chi2_stat}")
print(f"chi2 p : {chi2_p}")
print(f"phi    : {get_phi(np.array(contingency_table))}")
krieg_gegenwart   0   1
fixierbarkeit          
0.0              17   3
1.0              43  46


chi2   : 7.460835674157303
chi2 p : 0.006305572769125118
phi    : 0.2854496500496102
In [39]:
# Texts from 1914–18: is fixierbarkeit related to treating a war of that time?
meta_1914 = meta_anth.query("1914 <= year <= 1918").copy()
# 1 when 'stoffgebiet' contains 'Krieg' and the treated-time midpoint is 1914 or later, else 0.
meta_1914['krieg_gegenwart'] = [
    int('Krieg' in stoffgebiet and zeit_mitte >= 1914)
    for stoffgebiet, zeit_mitte in zip(meta_1914['stoffgebiet'], meta_1914['zeit_mitte'])
]
contingency_table = pd.crosstab(meta_1914['fixierbarkeit'], meta_1914['krieg_gegenwart'])

# Run the test once and reuse both values.
chi2_stat, chi2_p = chi2_contingency(contingency_table)[:2]

print(contingency_table)
print("\n")
print(f"chi2   : {chi2_stat}")
print(f"chi2 p : {chi2_p}")
print(f"phi    : {get_phi(np.array(contingency_table))}")
krieg_gegenwart   0   1
fixierbarkeit          
0.0              16   7
1.0              17  10


chi2   : 0.03674138371495582
chi2 p : 0.8479925435404684
phi    : 0.06946349424835088
In [40]:
# Period flag: 1 for texts from 1885 onwards, 0 otherwise. This adds a column to
# meta_anth_bin, so later cells see it too.
meta_anth_bin['period'] = (meta_anth_bin['year'] >= 1885).astype('int64')

# Relation between the period flag and 'verfremdung'.
results = relations_binbin(meta=meta_anth_bin, main_feature='period', comp_features=['verfremdung'])

results.round(3)
Out[40]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
verfremdung 0.004 5/1346 0.02 10/504 0.004 0.004 0.016 0.029 0.028 11.858 0.001 0.002 0.08 5.0 4.086
In [41]:
# Distribution of the 'verfremdung' values in each of the three corpora.
for corpus_meta in (meta_anth, meta_muench, meta_modcanon):
    verfremdung_counts = corpus_meta['verfremdung'].value_counts()
    print(verfremdung_counts)
    print("\n")
verfremdung
0.0    1835
0.5      12
1.0       3
Name: count, dtype: int64


verfremdung
0.0    137
0.5      3
Name: count, dtype: int64


verfremdung
0.0    78
0.5    25
1.0    10
Name: count, dtype: int64


Gegenwartsbezug¶

In [42]:
# Smoothed shares mapped to the labels shown in the plot.
gegenwart_labels = {
    'gegenwartsbezug_share_smoothed': 'Gegenwartsbezug',
    'gegenwartsdominanz_share_smoothed': 'Gegenwartsdominanz',
}
meta_plot = ts[list(gegenwart_labels)].rename(columns=gegenwart_labels)
# save_ts_data(meta_plot, prefix='06_06_Gegenwartsbezug_')

# Time-series plot with the sub-corpus values from sub_df added for comparison.
fig = create_ts_plot(
    data=meta_plot,
    columns=meta_plot.columns,
    y_axis_title='Anteil an Texten',
    add_corporas=sub_df,
    add_corpora_names=sub_names,
    add_corpora_categories=['gegenwartsbezug', 'gegenwartsdominanz'],
)
fig = update_fig_for_publication(fig)
fig.write_image("plots/6.6 Gegenwartsbezug und Gegenwartsdominanz im Zeitverlauf.pdf")
fig.show()
In [43]:
# Texts from 1870/71: relation between 'krieg' and 'gegenwartsbezug'.
meta_1870 = meta_anth_bin.query("1870 <= year <= 1871").copy()
contingency_table = pd.crosstab(meta_1870['krieg'], meta_1870['gegenwartsbezug'])

# Run the test once and reuse both values.
chi2_stat, chi2_p = chi2_contingency(contingency_table)[:2]

print(contingency_table)
print("\n")
print(f"chi2   : {chi2_stat}")
print(f"chi2 p : {chi2_p}")
print(f"phi    : {get_phi(np.array(contingency_table))}")
gegenwartsbezug  0.0  1.0
krieg                    
0                 16   26
1                 12   55


chi2   : 4.503397169168017
chi2 p : 0.0338275856526984
phi    : 0.22483543766235536
In [44]:
# Texts from 1888–98: relation between 'tod' and 'gegenwartsbezug'.
meta_1888 = meta_anth_bin.query("1888 <= year <= 1898").copy()
contingency_table = pd.crosstab(meta_1888['tod'], meta_1888['gegenwartsbezug'])

# Run the test once and reuse both values.
chi2_stat, chi2_p = chi2_contingency(contingency_table)[:2]

print(contingency_table)
print("\n")
print(f"chi2   : {chi2_stat}")
print(f"chi2 p : {chi2_p}")
print(f"phi    : {get_phi(np.array(contingency_table))}")
gegenwartsbezug  0.0  1.0
tod                      
0                110   49
1                 27   20


chi2   : 1.746966196683179
chi2 p : 0.18625857869318346
phi    : 0.1043439500305574
In [45]:
# Texts from 1914–18: relation between 'krieg' and 'gegenwartsbezug' (Fisher's exact test).
meta_1914 = meta_anth_bin.query("1914 <= year <= 1918").copy()
contingency_table = pd.crosstab(meta_1914['krieg'], meta_1914['gegenwartsbezug'])

# Run the test once; the first value is the test statistic, the second the p-value.
fisher_stat, fisher_p = fisher_exact(contingency_table)[:2]

print(contingency_table)
print("\n")
print(f"fisher   : {fisher_stat}")
print(f"fisher p : {fisher_p}")
print(f"phi      : {get_phi(np.array(contingency_table))}")
gegenwartsbezug  0.0  1.0
krieg                    
0                 14    7
1                 11   18


fisher   : 3.272727272727273
fisher p : 0.08451099837490807
phi      : 0.2836543144655877
In [46]:
# Switch the feature under examination; the following cells repeat the correlation analysis for it.
main_feature = 'gegenwartsbezug'
In [47]:
# The 30 strongest positive correlations with the main feature.
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values(ascending=False)
    .iloc[:30]
)
Out[47]:
gegenwartsbezug                         1.000000
gegenwartsdominant                      0.626075
sprechinstanz_nicht_in_vergangenheit    0.583905
ueberlieferung                          0.460262
wissen_identisch                        0.403737
nationalismus                           0.370030
zeitebenen                              0.345756
sprechinstanz_markiert                  0.331470
zeitmarker_vorhanden                    0.331360
ende                                    0.323848
dekade_mitte                            0.314006
zeit_mitte                              0.313603
neuzeit                                 0.308422
jahrhundert_mitte                       0.305689
denkmal                                 0.305105
nogenre                                 0.303092
beginn                                  0.296779
zustand                                 0.286530
liebe_positiv                           0.280020
anachronismus                           0.278666
sprechakt_behaupten_vorhanden           0.269346
krieg_positiv                           0.257633
sprechakte_count                        0.239705
stoffgebiet_positiv                     0.212390
nation_volk_d_positiv                   0.209529
ueberlieferung_positiv                  0.201391
nichtmensch_count                       0.198946
geschichtsauffassung_positiv            0.192482
politik_positiv                         0.184400
kollektiv_positiv                       0.184005
Name: gegenwartsbezug, dtype: float64
In [48]:
# The 20 numeric columns with the lowest correlation to the main feature.
(
    meta_anth_bin
    .corr(numeric_only=True)[main_feature]
    .sort_values(ascending=True)
    .head(20)
)
Out[48]:
wissen_ergaenzend                -0.421585
konkretheit                      -0.313873
ballade                          -0.312448
in_hohem_mass_konkret            -0.306742
sprechinstanz_in_vergangenheit   -0.263657
sprechakt_erzaehlen_vorhanden    -0.240965
mittelalter                      -0.209565
ereignis                         -0.205842
antike                           -0.180628
rollengedicht                    -0.173510
bekanntes_individuum_count       -0.148460
entity_negativ                   -0.146107
stoffgebiet_negativ              -0.144972
objektmarker_vorhanden           -0.144449
kollektiv_negativ                -0.143118
liebe_negativ                    -0.134840
unbekanntes_individuum_negativ   -0.133431
krieg_negativ                    -0.121866
religion                         -0.109717
entity_neutral                   -0.108622
Name: gegenwartsbezug, dtype: float64
In [49]:
# Cut-off handed to get_features and reused by the result filters in later cells.
threshold = 0.3

# Compute the correlation column once; both get_features calls read the same
# values, so there is no need to rebuild the full correlation matrix twice.
main_feature_corr = meta_anth_bin.corr(numeric_only=True)[main_feature]

bin_comp_features = get_features(main_feature_corr, threshold = threshold, mode = 'bin')
# Add this feature to the 'bin' selection explicitly, independent of what
# get_features returned.
bin_comp_features = bin_comp_features + ['in_hohem_mass_konkret']
cont_comp_features = get_features(main_feature_corr, threshold = threshold, mode = 'cont')
In [50]:
# Relate the main feature to each comparison feature selected with mode 'bin'.
results = relations_binbin(meta=meta_anth_bin, main_feature=main_feature, comp_features=bin_comp_features)
In [51]:
# Features treated as directly related to the main feature; they are left out
# of the filtered table.
directly_related = [
    'sprechinstanz_nicht_in_vergangenheit',
    'gegenwartsdominant',
    'neuzeit',
    'nogenre',            # related to ballade + denkmal
    'wissen_ergaenzend',  # related to wissen_identisch
]

# Step 1: drop the directly related features.
# Step 2: keep rows with chi2_p below 0.05, min_expected of at least 5 and phi
#         of at least `threshold`.
# Step 3: order by 'diff', largest first.
not_directly_related = results.query("index not in @directly_related")
passing_rows = not_directly_related.query("chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold")
results_filtered = passing_rows.sort_values(by='diff', ascending=False)

results_filtered.round(2)
Out[51]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
ueberlieferung 0.11 144/1314 0.54 289/536 0.38 0.38 0.43 0.48 0.48 391.91 0.0 0.0 0.46 144.0 125.45
sprechinstanz_markiert 0.33 438/1314 0.70 373/536 0.32 0.32 0.36 0.41 0.41 203.26 0.0 0.0 0.33 163.0 234.97
zeitmarker_vorhanden 0.32 416/1314 0.68 363/536 0.31 0.31 0.36 0.41 0.40 203.13 0.0 0.0 0.33 173.0 225.70
wissen_identisch 0.05 69/1314 0.36 195/536 0.27 0.27 0.31 0.35 0.36 301.56 0.0 0.0 0.40 69.0 76.49
nationalismus 0.06 73/1314 0.34 180/536 0.24 0.24 0.28 0.32 0.32 253.31 0.0 0.0 0.37 73.0 73.30
denkmal 0.00 2/1314 0.13 71/536 0.10 0.10 0.13 0.16 0.16 172.21 0.0 0.0 0.31 2.0 21.15
in_hohem_mass_konkret 0.89 1174/1314 0.63 339/536 -0.31 -0.31 -0.26 -0.22 -0.22 174.07 0.0 0.0 0.31 140.0 97.64
ballade 0.66 866/1314 0.32 170/536 -0.39 -0.39 -0.34 -0.29 -0.29 180.60 0.0 0.0 0.31 170.0 235.84
In [52]:
# All remaining rows of `results` that are not part of the filtered table,
# shown with the largest 'diff' first.
results_other = results.query("index not in @results_filtered.index")
results_other.sort_values(by='diff', ascending=False).round(2)
Out[52]:
wenn_nicht wenn_nicht_detail wenn_ja wenn_ja_detail diff_low_bootstrap diff_low diff diff_high diff_high_bootstrap chi2 chi2_p fisher_p phi min_real min_expected
sprechinstanz_nicht_in_vergangenheit 0.11 145/1314 0.69 368/536 0.53 0.53 0.58 0.62 0.62 630.75 0.00 0.00 0.58 145.0 148.63
gegenwartsdominant 0.00 5/1314 0.49 262/536 0.44 0.44 0.49 0.53 0.52 725.14 0.00 0.00 0.63 5.0 77.36
neuzeit 0.44 582/1314 0.78 419/536 0.29 0.29 0.34 0.38 0.38 175.98 0.00 0.00 0.31 117.0 245.98
nogenre 0.15 193/1314 0.43 229/536 0.23 0.23 0.28 0.33 0.33 169.95 0.00 0.00 0.30 193.0 122.27
stoffgebiet_positiv 0.40 723/1809 0.63 461/731 0.19 0.19 0.23 0.27 0.27 111.61 0.00 0.00 0.21 270.0 340.75
bekanntes_individuum_positiv 0.52 808/1546 0.72 344/479 0.15 0.15 0.20 0.24 0.24 57.00 0.00 0.00 0.17 135.0 206.50
kollektiv_positiv 0.32 232/734 0.50 164/330 0.12 0.12 0.18 0.24 0.25 31.88 0.00 0.00 0.17 164.0 122.82
entity_positiv 0.43 1237/2860 0.60 634/1056 0.13 0.13 0.17 0.20 0.20 87.10 0.00 0.00 0.15 422.0 504.54
unbekanntes_individuum_positiv 0.32 160/498 0.43 57/133 0.01 0.01 0.11 0.20 0.21 5.35 0.02 0.02 0.09 57.0 45.74
entity_ambivalent 0.07 197/2860 0.04 47/1056 -0.04 -0.04 -0.02 -0.01 -0.01 7.84 0.01 0.00 0.04 47.0 65.80
stoffgebiet_ambivalent 0.14 254/1809 0.11 84/731 -0.05 -0.05 -0.03 0.00 0.00 2.93 0.09 0.09 0.03 84.0 97.27
bekanntes_individuum_negativ 0.15 231/1546 0.09 45/479 -0.09 -0.09 -0.06 -0.02 -0.02 9.56 0.00 0.00 0.07 45.0 65.29
entity_neutral 0.31 899/2860 0.24 258/1056 -0.10 -0.10 -0.07 -0.04 -0.04 18.16 0.00 0.00 0.07 258.0 312.00
entity_negativ 0.18 527/2860 0.11 117/1056 -0.10 -0.10 -0.07 -0.05 -0.05 30.30 0.00 0.00 0.09 117.0 173.66
stoffgebiet_neutral 0.21 388/1809 0.13 94/731 -0.12 -0.12 -0.09 -0.06 -0.05 24.98 0.00 0.00 0.10 94.0 138.72
unbekanntes_individuum_negativ 0.15 75/498 0.05 6/133 -0.15 -0.15 -0.11 -0.06 -0.06 10.44 0.00 0.00 0.13 6.0 17.07
kollektiv_negativ 0.29 211/734 0.17 57/330 -0.16 -0.17 -0.11 -0.06 -0.06 15.90 0.00 0.00 0.12 57.0 83.12
stoffgebiet_negativ 0.25 444/1809 0.13 92/731 -0.15 -0.15 -0.12 -0.09 -0.09 44.72 0.00 0.00 0.13 92.0 154.26
wissen_ergaenzend 0.86 1136/1314 0.46 247/536 -0.45 -0.45 -0.40 -0.36 -0.36 328.81 0.00 0.00 0.42 178.0 135.30
In [53]:
# Columns taken over from each period's result table.
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]

# Recompute the filtered relations separately for 1850-1884 and 1885-1918.
anth_until_1884 = meta_anth_bin.query("1850 <= year <= 1884")
anth_from_1885 = meta_anth_bin.query("1885 <= year <= 1918")

results_a = relations_binbin(meta=anth_until_1884, main_feature=main_feature, comp_features=results_filtered.index)
results_b = relations_binbin(meta=anth_from_1885, main_feature=main_feature, comp_features=results_filtered.index)

# Place both periods side by side and add the change between them
# (later period minus earlier period).
results_merged = (
    results_a[result_categories]
    .join(results_b[result_categories], lsuffix='_1850', rsuffix='_1885')
    .assign(
        diff_of_diffs=lambda merged: merged['diff_1885'] - merged['diff_1850'],
        diff_of_phis=lambda merged: merged['phi_1885'] - merged['phi_1850'],
    )
)

results_merged.sort_values(by='diff_of_phis').round(3)
Out[53]:
wenn_nicht_1850 wenn_nicht_detail_1850 wenn_ja_1850 wenn_ja_detail_1850 diff_1850 chi2_p_1850 phi_1850 wenn_nicht_1885 wenn_nicht_detail_1885 wenn_ja_1885 wenn_ja_detail_1885 diff_1885 chi2_p_1885 phi_1885 diff_of_diffs diff_of_phis
sprechinstanz_markiert 0.328 313/955 0.708 277/391 0.381 0.0 0.348 0.348 125/359 0.662 96/145 0.314 0.0 0.286 -0.067 -0.062
ballade 0.704 672/955 0.358 140/391 -0.346 0.0 0.321 0.540 194/359 0.207 30/145 -0.333 0.0 0.304 0.012 -0.017
zeitmarker_vorhanden 0.309 295/955 0.673 263/391 0.364 0.0 0.335 0.337 121/359 0.690 100/145 0.353 0.0 0.322 -0.011 -0.014
nationalismus 0.053 51/955 0.332 130/391 0.279 0.0 0.371 0.061 22/359 0.345 50/145 0.284 0.0 0.367 0.004 -0.005
ueberlieferung 0.109 104/955 0.532 208/391 0.423 0.0 0.455 0.111 40/359 0.559 81/145 0.447 0.0 0.474 0.024 0.019
in_hohem_mass_konkret 0.895 855/955 0.650 254/391 -0.246 0.0 0.293 0.889 319/359 0.586 85/145 -0.302 0.0 0.343 -0.057 0.050
denkmal 0.002 2/955 0.123 48/391 0.121 0.0 0.290 0.000 0/359 0.159 23/145 0.159 0.0 0.344 0.038 0.054
wissen_identisch 0.050 48/955 0.327 128/391 0.277 0.0 0.373 0.058 21/359 0.462 67/145 0.404 0.0 0.481 0.126 0.108
In [54]:
# Relate the main feature to each comparison feature selected with mode 'cont'.
# Note: this rebinds `results`, replacing the table produced by relations_binbin.
results = relations_bincont(meta=meta_anth_bin, main_feature=main_feature, comp_features=cont_comp_features)
In [55]:
# Features treated as directly related to the main feature; they are left out
# of the filtered table.
directly_related = [
    'ende',
    'dekade_mitte',
    'zeit_mitte',
    'jahrhundert_mitte',
    'konkretheit',  # see binary feature
]

# Step 1: drop the directly related features.
# Step 2: keep rows with mannwhitneyu_p below 0.05 whose point-biserial
#         correlation is at least `threshold` in either direction.
# Step 3: order by that correlation, largest first.
not_directly_related = results.query("index not in @directly_related")
passing_rows = not_directly_related.query("mannwhitneyu_p < 0.05 and (pointbiserialr_corr >= @threshold or pointbiserialr_corr <= -@threshold)")
results_filtered = passing_rows.sort_values(by='pointbiserialr_corr', ascending=False)

results_filtered.round(2)
Out[55]:
wenn_nicht a_merkmal=0 a_merkmal=1 a_merkmal=2 a_merkmal=3 a_merkmal>=4 wenn_ja b_merkmal=0 b_merkmal=1 b_merkmal=2 ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p meandiffs_ci_lower meandiffs_ci_bootstrap_lower meandiffs_ci_upper meandiffs_ci_bootstrap_upper
zeitebenen 1.84 0.0 [0/1314] 0.39 [506/1314] 0.43 [560/1314] 0.16 [205/1314] 0.03 [43/1314] 2.49 0.0 [0/536] 0.04 [20/536] 0.55 [294/536] ... 0.35 0.0 0.0 -0.82 202695.5 0.0 0.57 0.57 0.73 0.73

1 rows × 22 columns

In [56]:
# All remaining rows of `results` that are not part of the filtered table,
# shown with the largest point-biserial correlation first.
results_other = results.query("index not in @results_filtered.index")
results_other.sort_values(by='pointbiserialr_corr', ascending=False).round(2)
Out[56]:
wenn_nicht a_merkmal=0 a_merkmal=1 a_merkmal=2 a_merkmal=3 a_merkmal>=4 wenn_ja b_merkmal=0 b_merkmal=1 b_merkmal=2 ... pointbiserialr_corr pointbiserialr_p ttest_p cohens_d mannwhitneyu_stat mannwhitneyu_p meandiffs_ci_lower meandiffs_ci_bootstrap_lower meandiffs_ci_upper meandiffs_ci_bootstrap_upper
ende 1171.66 0.0 [4/1312] 0.0 [0/1312] 0.0 [0/1312] 0.0 [0/1312] 0.91 [1189/1312] 1655.33 0.0 [1/530] 0.0 [0/530] 0.0 [0/530] ... 0.32 0.0 0.0 -0.82 126079.0 0.0 419.06 428.21 548.28 536.90
dekade_mitte 1158.13 0.01 [10/1312] 0.0 [0/1312] 0.0 [0/1312] 0.0 [0/1312] 0.9 [1182/1312] 1631.36 0.0 [2/530] 0.0 [0/530] 0.0 [0/530] ... 0.31 0.0 0.0 -0.79 131773.0 0.0 407.81 422.99 538.65 527.47
zeit_mitte 1161.92 0.0 [5/1312] 0.0 [0/1312] 0.0 [0/1312] 0.0 [0/1312] 0.9 [1187/1312] 1634.65 0.0 [2/530] 0.0 [0/530] 0.0 [0/530] ... 0.31 0.0 0.0 -0.79 131463.0 0.0 407.28 416.46 538.17 525.27
jahrhundert_mitte 1120.58 0.03 [38/1312] 0.0 [0/1312] 0.0 [0/1312] 0.0 [0/1312] 0.88 [1154/1312] 1579.25 0.01 [4/530] 0.0 [0/530] 0.0 [0/530] ... 0.31 0.0 0.0 -0.77 149478.5 0.0 393.35 404.68 523.98 515.55
konkretheit 0.95 0.0 [4/1314] 0.89 [1174/1314] 0.0 [0/1314] 0.0 [0/1314] 0.0 [0/1314] 0.80 0.04 [21/536] 0.63 [339/536] 0.0 [0/536] ... -0.31 0.0 0.0 0.65 445137.0 0.0 -0.17 -0.17 -0.13 -0.12

5 rows × 22 columns

In [57]:
# Work on a copy so the helper column 'plot_legend' never ends up in meta_anth_bin.
meta_plot = meta_anth_bin.copy()

# Draw one grouped histogram per feature in the filtered result table,
# split by whether the main feature equals 1.
for cont_comp_feature in results_filtered.index:
    # Group means are shown in the legend labels.
    mean_main = meta_plot[meta_plot[main_feature] == 1][cont_comp_feature].mean()
    mean_notmain = meta_plot[meta_plot[main_feature] == 0][cont_comp_feature].mean()
    label_main = f"Gegenwartsbezug<br>(Mittelwert = {round(mean_main, 2)})"
    label_notmain = f"Kein Gegenwartsbezug<br>(Mittelwert = {round(mean_notmain, 2)})"
    # Every row whose main feature is not 1 receives the "no" label.
    meta_plot['plot_legend'] = [label_main if x == 1 else label_notmain for x in meta_plot[main_feature]]

    fig = px.histogram(
        meta_plot,
        x = cont_comp_feature,
        color = 'plot_legend',
        histnorm = 'probability density',
        barmode = 'group',
        labels = {'plot_legend' : '', 'zeitebenen' : 'Anzahl Zeitebenen',}
    )

    fig.update_layout(
        width = 700, height = 300,
        yaxis_title="Anteil",
        # The axis attribute `titlefont` is deprecated and no longer accepted by
        # recent plotly releases; `title_font` (i.e. title.font) is the
        # supported spelling and renders identically.
        xaxis=dict(tickfont=dict(size=16), title_font=dict(size=16)),
        yaxis=dict(tickfont=dict(size=16), title_font=dict(size=16)),
        legend=dict(font = dict(size=16), x=0.54, y = 0.88),
        bargap=0.1
    )
    # Uncomment to export the figure as a PDF:
    # fig.write_image(f"plots/6.6 Gegenwartsbezug – {cont_comp_feature}.pdf")
    fig.show()
In [58]:
# Columns taken over from each period's result table.
result_categories = ['wenn_nicht', 'wenn_ja', 'mannwhitneyu_p', 'pointbiserialr_corr',]

# Recompute the filtered relations separately for 1850-1884 and 1885-1918.
anth_until_1884 = meta_anth_bin.query("1850 <= year <= 1884")
anth_from_1885 = meta_anth_bin.query("1885 <= year <= 1918")

results_a = relations_bincont(meta=anth_until_1884, main_feature=main_feature, comp_features=results_filtered.index)
results_b = relations_bincont(meta=anth_from_1885, main_feature=main_feature, comp_features=results_filtered.index)

# Place both periods side by side and add the change in correlation
# (later period minus earlier period).
results_merged = (
    results_a[result_categories]
    .join(results_b[result_categories], lsuffix='_1850', rsuffix='_1885')
    .assign(
        diff_of_corrs=lambda merged: merged['pointbiserialr_corr_1885'] - merged['pointbiserialr_corr_1850'],
    )
)

results_merged.sort_values(by='diff_of_corrs').round(3)
Out[58]:
wenn_nicht_1850 wenn_ja_1850 mannwhitneyu_p_1850 pointbiserialr_corr_1850 wenn_nicht_1885 wenn_ja_1885 mannwhitneyu_p_1885 pointbiserialr_corr_1885 diff_of_corrs
zeitebenen 1.834 2.509 0.0 0.368 1.85 2.421 0.0 0.291 -0.077